{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
    "os.environ[\"http_proxy\"] = \"xxx\"\n",
    "os.environ[\"https_proxy\"] = \"xxx\"\n",
    "\n",
    "# 数据输入与提取\n",
    "def load_pdf(source_files, file_):\n",
    "    pdf_file = fitz.open(file_) \n",
    "    image_counter = 0\n",
    "    metadata = {}\n",
    "    for page_index in range(0,len(pdf_file)): \n",
    "        page = pdf_file[page_index] \n",
    "    blocks = pdf_file[page_index].get_text(\"blocks\")\n",
    "    image_meta = [ (blocks[i][4], blocks[i+1][4]) for i in range(0,len(blocks)) if blocks[i][-1]==1 ]  \n",
    "    image_info= [ image_meta[0][0] if image_meta  else []] \n",
    "    image_title = [ image_meta[0][1] if image_meta else []] \n",
    "    for image in page.get_images():\n",
    "        image_id = image[7]\n",
    "        image_block_id = image[0] \n",
    "        image_title_block_id = image_block_id+1 \n",
    "        image_dim = image[2],image[3] \n",
    "        image_counter = image_counter+1\n",
    "        metadata[image_counter] = { 'page': page_index, 'image': image_id,\n",
    "            'block': image_block_id,\n",
    "            'image_dim': image_dim,\n",
    "            'image_info': str(image_info[0]),\n",
    "            'image_title': str(image_title[0]),\n",
    "            'image_file': f\"{image_id}_{image_block_id}.png\",\n",
    "            'image_path':os.path.join(source_files,f\"{image_id}_{image_block_id}.png\")\n",
    "        }\n",
    "    pix = fitz.Pixmap(pdf_file, image[0])\n",
    "    pix.save(os.path.join(source_files, f\"{image_id}_{image_block_id}.png\"))\n",
    "    with open(os.path.join(source,f'metadata.json'),'w') as f:\n",
    "        json.dump(metadata,f)\n",
    "        \n",
    "    return metadata\n",
    "    \n",
    "metadata = load_pdf(source, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 多模态向量存储\n",
    "#  pip install llama-index==0.9.48\n",
    "import qdrant_client\n",
    "from llama_index import ServiceContext,  SimpleDirectoryReader\n",
    "from llama_index.vector_stores.qdrant import QdrantVectorStore\n",
    "from llama_index import VectorStoreIndex, StorageContext\n",
    "from llama_index.indices.multi_modal.base import MultiModalVectorStoreIndex\n",
    "\n",
    "client = qdrant_client.QdrantClient(path=\"qdrant_mm\")\n",
    "text_store = QdrantVectorStore(client=client, collection_name=\"text_collection\")\n",
    "image_store = QdrantVectorStore(\n",
    "    client=client, collection_name=\"image_collection\"\n",
    ")\n",
    "storage_context = StorageContext.from_defaults(\n",
    "    vector_store=text_store, image_store=image_store\n",
    ")\n",
    "documents = SimpleDirectoryReader(\"./source_files/\").load_data()\n",
    "index = MultiModalVectorStoreIndex.from_documents(\n",
    "    documents,\n",
    "    storage_context=storage_context,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 查询与检索\n",
    "from llama_index.multi_modal_llms.openai import OpenAIMultiModal\n",
    "openai_mm_llm = OpenAIMultiModal(model=\"gpt-4-vision-preview\",\n",
    "    max_new_tokens=1500\n",
    ")\n",
    "query_engine = index.as_query_engine(multi_modal_llm=openai_mm_llm,\n",
    "    similarity_top_k=3, \n",
    "    image_similarity_top_k=3\n",
    ")\n",
    "query_str = \"What are the various stages of LLMs from pre-training\"\n",
    "response_mm = query_engine.query(query_str)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
